From 47e2ef34b0c3dbecfc80b84acad9d2bb0baac1e2 Mon Sep 17 00:00:00 2001 From: Brad Jorsch Date: Wed, 13 Jul 2016 11:30:37 -0400 Subject: [PATCH] Only store currently-existing categories in the categories table A "currently-existing category" is defined as a category that either contains any pages or has a description page. Thus: * Category::initialize() now schedules an update to insert a row if the title exits but the row is missing. * Category::refreshCounts() now removes the row if the title doesn't exist and the category is empty. * WikiPage::onArticleCreate() loads the Category object, to trigger bullet #1. * WikiPage::updateCategoryCounts() refreshes the counts if it results in the row showing 0 pages, to trigger bullet #2. * LinksDeletionUpdate refreshes the counts if the row shows 0 pages, to trigger bullet #2. A maintenance script is provided to update the category table for this new definition. Bug: T28411 Bug: T50824 Change-Id: I0f0adf124c181ae5d3c7c95b3b5fb275a725794c --- autoload.php | 1 + includes/Category.php | 49 ++++-- includes/deferred/LinksDeletionUpdate.php | 17 +- includes/installer/DatabaseUpdater.php | 1 + includes/page/WikiPage.php | 24 +++ maintenance/cleanupEmptyCategories.php | 204 ++++++++++++++++++++++ maintenance/mssql/tables.sql | 6 +- maintenance/tables.sql | 6 +- 8 files changed, 287 insertions(+), 21 deletions(-) create mode 100644 maintenance/cleanupEmptyCategories.php diff --git a/autoload.php b/autoload.php index d82d6993c6..9dff36a5b8 100644 --- a/autoload.php +++ b/autoload.php @@ -245,6 +245,7 @@ $wgAutoloadLocalClasses = [ 'ClassCollector' => __DIR__ . '/includes/utils/AutoloadGenerator.php', 'CleanupAncientTables' => __DIR__ . '/maintenance/cleanupAncientTables.php', 'CleanupBlocks' => __DIR__ . '/maintenance/cleanupBlocks.php', + 'CleanupEmptyCategories' => __DIR__ . '/maintenance/cleanupEmptyCategories.php', 'CleanupPreferences' => __DIR__ . '/maintenance/cleanupPreferences.php', 'CleanupRemovedModules' => __DIR__ . '/maintenance/cleanupRemovedModules.php', 'CleanupSpam' => __DIR__ . '/maintenance/cleanupSpam.php', diff --git a/includes/Category.php b/includes/Category.php index 28b566a7f9..531e0be998 100644 --- a/includes/Category.php +++ b/includes/Category.php @@ -79,6 +79,11 @@ class Category { $this->mSubcats = 0; $this->mFiles = 0; + # If the title exists, call refreshCounts to add a row for it. + if ( $this->mTitle->exists() ) { + DeferredUpdates::addCallableUpdate( [ $this, 'refreshCounts' ] ); + } + return true; } else { return false; # Fail @@ -331,21 +336,35 @@ class Category { [ 'LOCK IN SHARE MODE' ] ); + $shouldExist = $result->pages > 0 || $this->getTitle()->exists(); + if ( $this->mID ) { - # The category row already exists, so do a plain UPDATE instead - # of INSERT...ON DUPLICATE KEY UPDATE to avoid creating a gap - # in the cat_id sequence. The row may or may not be "affected". - $dbw->update( - 'category', - [ - 'cat_pages' => $result->pages, - 'cat_subcats' => $result->subcats, - 'cat_files' => $result->files - ], - [ 'cat_title' => $this->mName ], - __METHOD__ - ); - } else { + if ( $shouldExist ) { + # The category row already exists, so do a plain UPDATE instead + # of INSERT...ON DUPLICATE KEY UPDATE to avoid creating a gap + # in the cat_id sequence. The row may or may not be "affected". + $dbw->update( + 'category', + [ + 'cat_pages' => $result->pages, + 'cat_subcats' => $result->subcats, + 'cat_files' => $result->files + ], + [ 'cat_title' => $this->mName ], + __METHOD__ + ); + } else { + # The category is empty and has no description page, delete it + $dbw->delete( + 'category', + [ 'cat_title' => $this->mName ], + __METHOD__ + ); + $this->mID = false; + } + } elseif ( $shouldExist ) { + # The category row doesn't exist but should, so create it. Use + # upsert in case of races. $dbw->upsert( 'category', [ @@ -362,6 +381,8 @@ class Category { ], __METHOD__ ); + // @todo: Should we update $this->mID here? Or not since Category + // objects tend to be short lived enough to not matter? } $dbw->endAtomic( __METHOD__ ); diff --git a/includes/deferred/LinksDeletionUpdate.php b/includes/deferred/LinksDeletionUpdate.php index a7c39ca625..b60ed8ac81 100644 --- a/includes/deferred/LinksDeletionUpdate.php +++ b/includes/deferred/LinksDeletionUpdate.php @@ -61,6 +61,8 @@ class LinksDeletionUpdate extends SqlDataUpdate implements EnqueueableDataUpdate // This handles the case when updates have to batched into several COMMITs. $scopedLock = LinksUpdate::acquirePageLock( $this->mDb, $id ); + $title = $this->page->getTitle(); + // Delete restrictions for it $this->mDb->delete( 'page_restrictions', [ 'pr_page' => $id ], __METHOD__ ); @@ -80,6 +82,20 @@ class LinksDeletionUpdate extends SqlDataUpdate implements EnqueueableDataUpdate } } + // Refresh the category table entry if it seems to have no pages. Check + // master for the most up-to-date cat_pages count. + if ( $title->getNamespace() === NS_CATEGORY ) { + $row = $this->mDb->selectRow( + 'category', + [ 'cat_id', 'cat_title', 'cat_pages', 'cat_subcats', 'cat_files' ], + [ 'cat_title' => $title->getDBkey(), 'cat_pages <= 0' ], + __METHOD__ + ); + if ( $row ) { + $cat = Category::newFromRow( $row, $title )->refreshCounts(); + } + } + // If using cascading deletes, we can skip some explicit deletes if ( !$this->mDb->cascadingDeletes() ) { // Delete outgoing links @@ -132,7 +148,6 @@ class LinksDeletionUpdate extends SqlDataUpdate implements EnqueueableDataUpdate // If using cleanup triggers, we can skip some manual deletes if ( !$this->mDb->cleanupTriggers() ) { - $title = $this->page->getTitle(); // Find recentchanges entries to clean up... $rcIdsForTitle = $this->mDb->selectFieldValues( 'recentchanges', diff --git a/includes/installer/DatabaseUpdater.php b/includes/installer/DatabaseUpdater.php index 6a20abc902..0d8137ce21 100644 --- a/includes/installer/DatabaseUpdater.php +++ b/includes/installer/DatabaseUpdater.php @@ -75,6 +75,7 @@ abstract class DatabaseUpdater { PopulateFilearchiveSha1::class, PopulateBacklinkNamespace::class, FixDefaultJsonContentPages::class, + CleanupEmptyCategories::class, ]; /** diff --git a/includes/page/WikiPage.php b/includes/page/WikiPage.php index b06b51967a..b64604e588 100644 --- a/includes/page/WikiPage.php +++ b/includes/page/WikiPage.php @@ -3279,6 +3279,14 @@ class WikiPage implements Page, IDBAccessObject { $title->touchLinks(); $title->purgeSquid(); $title->deleteTitleProtection(); + + if ( $title->getNamespace() == NS_CATEGORY ) { + // Load the Category object, which will schedule a job to create + // the category table row if necessary. Checking a slave is ok + // here, in the worst case it'll run an unnecessary recount job on + // a category that probably doesn't have many members. + Category::newFromTitle( $title )->getID(); + } } /** @@ -3525,6 +3533,22 @@ class WikiPage implements Page, IDBAccessObject { $cat = Category::newFromName( $catName ); Hooks::run( 'CategoryAfterPageRemoved', [ $cat, $this, $id ] ); } + + // Refresh counts on categories that should be empty now, to + // trigger possible deletion. Check master for the most + // up-to-date cat_pages. + if ( count( $deleted ) ) { + $rows = $dbw->select( + 'category', + [ 'cat_id', 'cat_title', 'cat_pages', 'cat_subcats', 'cat_files' ], + [ 'cat_title' => $deleted, 'cat_pages <= 0' ], + $method + ); + foreach ( $rows as $row ) { + $cat = Category::newFromRow( $row ); + $cat->refreshCounts(); + } + } } ); } diff --git a/maintenance/cleanupEmptyCategories.php b/maintenance/cleanupEmptyCategories.php new file mode 100644 index 0000000000..b8a246eb22 --- /dev/null +++ b/maintenance/cleanupEmptyCategories.php @@ -0,0 +1,204 @@ +addDescription( + <<addOption( + 'mode', + '"add" empty categories with description pages, "remove" empty categories ' + . 'without description pages, or "both"', + false, + true + ); + $this->addOption( + 'begin', + 'Only do categories whose names are alphabetically after the provided name', + false, + true + ); + $this->addOption( + 'throttle', + 'Wait this many milliseconds after each batch. Default: 0', + false, + true + ); + } + + protected function getUpdateKey() { + return 'cleanup empty categories'; + } + + protected function doDBUpdates() { + $mode = $this->getOption( 'mode', 'both' ); + $begin = $this->getOption( 'begin', '' ); + $throttle = $this->getOption( 'throttle', 0 ); + + if ( !in_array( $mode, [ 'add', 'remove', 'both' ] ) ) { + $this->output( "--mode must be 'add', 'remove', or 'both'.\n" ); + return false; + } + + $dbw = $this->getDB( DB_MASTER ); + + $throttle = intval( $throttle ); + + if ( $mode === 'add' || $mode === 'both' ) { + if ( $begin !== '' ) { + $where = [ 'page_title > ' . $dbw->addQuotes( $begin ) ]; + } else { + $where = []; + } + + $this->output( "Adding empty categories with description pages...\n" ); + while ( true ) { + # Find which category to update + $rows = $dbw->select( + [ 'page', 'category' ], + 'page_title', + array_merge( $where, [ + 'page_namespace' => NS_CATEGORY, + 'cat_title' => null, + ] ), + __METHOD__, + [ + 'ORDER BY' => 'page_title', + 'LIMIT' => $this->mBatchSize, + ], + [ + 'category' => [ 'LEFT JOIN', 'page_title = cat_title' ], + ] + ); + if ( !$rows || $rows->numRows() <= 0 ) { + # Done, hopefully. + break; + } + + foreach ( $rows as $row ) { + $name = $row->page_title; + $where = [ 'page_title > ' . $dbw->addQuotes( $name ) ]; + + # Use the row to update the category count + $cat = Category::newFromName( $name ); + if ( !is_object( $cat ) ) { + $this->output( "The category named $name is not valid?!\n" ); + } else { + $cat->refreshCounts(); + } + } + $this->output( "--mode=$mode --begin=$name\n" ); + + wfWaitForSlaves(); + usleep( $throttle * 1000 ); + } + + $begin = ''; + } + + if ( $mode === 'remove' || $mode === 'both' ) { + if ( $begin !== '' ) { + $where = [ 'cat_title > ' . $dbw->addQuotes( $begin ) ]; + } else { + $where = []; + } + $i = 0; + + $this->output( "Removing empty categories without description pages...\n" ); + while ( true ) { + # Find which category to update + $rows = $dbw->select( + [ 'category', 'page' ], + 'cat_title', + array_merge( $where, [ + 'page_title' => null, + 'cat_pages' => 0, + ] ), + __METHOD__, + [ + 'ORDER BY' => 'cat_title', + 'LIMIT' => $this->mBatchSize, + ], + [ + 'page' => [ 'LEFT JOIN', [ + 'page_namespace' => NS_CATEGORY, 'page_title = cat_title' + ] ], + ] + ); + if ( !$rows || $rows->numRows() <= 0 ) { + # Done, hopefully. + break; + } + foreach ( $rows as $row ) { + $name = $row->cat_title; + $where = [ 'cat_title > ' . $dbw->addQuotes( $name ) ]; + + # Use the row to update the category count + $cat = Category::newFromName( $name ); + if ( !is_object( $cat ) ) { + $this->output( "The category named $name is not valid?!\n" ); + } else { + $cat->refreshCounts(); + } + } + + $this->output( "--mode=remove --begin=$name\n" ); + + wfWaitForSlaves(); + usleep( $throttle * 1000 ); + } + } + + $this->output( "Category cleanup complete.\n" ); + + return true; + } +} + +$maintClass = 'CleanupEmptyCategories'; +require_once RUN_MAINTENANCE_IF_MAIN; diff --git a/maintenance/mssql/tables.sql b/maintenance/mssql/tables.sql index 12cfed8e71..48b2250fb4 100644 --- a/maintenance/mssql/tables.sql +++ b/maintenance/mssql/tables.sql @@ -336,9 +336,9 @@ CREATE INDEX /*i*/cl_timestamp ON /*_*/categorylinks (cl_to,cl_timestamp); CREATE INDEX /*i*/cl_collation_ext ON /*_*/categorylinks (cl_collation, cl_to, cl_type, cl_from); -- --- Track all existing categories. Something is a category if 1) it has an en- --- try somewhere in categorylinks, or 2) it once did. Categories might not --- have corresponding pages, so they need to be tracked separately. +-- Track all existing categories. Something is a category if 1) it has an entry +-- somewhere in categorylinks, or 2) it has a description page. Categories +-- might not have corresponding pages, so they need to be tracked separately. -- CREATE TABLE /*_*/category ( -- Primary key diff --git a/maintenance/tables.sql b/maintenance/tables.sql index 89aeb9c91d..9c9bdfb980 100644 --- a/maintenance/tables.sql +++ b/maintenance/tables.sql @@ -623,9 +623,9 @@ CREATE INDEX /*i*/cl_timestamp ON /*_*/categorylinks (cl_to,cl_timestamp); CREATE INDEX /*i*/cl_collation_ext ON /*_*/categorylinks (cl_collation, cl_to, cl_type, cl_from); -- --- Track all existing categories. Something is a category if 1) it has an en- --- try somewhere in categorylinks, or 2) it once did. Categories might not --- have corresponding pages, so they need to be tracked separately. +-- Track all existing categories. Something is a category if 1) it has an entry +-- somewhere in categorylinks, or 2) it has a description page. Categories +-- might not have corresponding pages, so they need to be tracked separately. -- CREATE TABLE /*_*/category ( -- Primary key -- 2.20.1